# Load every raw monitor CSV and combine into a single data frame.
# Column names are harmonized via rename(any_of(...)) so files using either
# naming scheme (e.g. 'windSpeed' vs 'ws') end up with common names.
# Read into a preallocated list and bind once at the end: growing a data
# frame with bind_rows() inside the loop copies it on every iteration (O(n^2)).
rename_map <- c(ws = 'windSpeed', wd = 'windDirection', H2S = 'Value', QC.Code = 'qcCode',
                Monitor = 'siteName', MinDist = 'NEAR_DIST')
raw_files <- list.files('data/raw_csv')
data_list <- vector("list", length(raw_files))
for (i in seq_along(raw_files)) {
  new_data <- read.csv(paste0('data/raw_csv/', raw_files[i])) %>% rename(any_of(rename_map))
  # Report how many H2S readings are missing in each file (printed below).
  print(sum(is.na(new_data$H2S)))
  data_list[[i]] <- new_data
}
full_data <- bind_rows(data_list)
remove(raw_files, data_list)
## [1] 112
## [1] 0
## [1] 9398
## [1] 9771
## [1] 32036
## [1] 11599
## [1] 12440
## [1] 11914
## [1] 18019
## [1] 11424
## [1] 0
## [1] 23974
## [1] 0
# Drop the per-file temporary and reclaim memory.
rm(new_data)
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 3638185 194.4 6079797 324.7 4065484 217.2
## Vcells 102610639 782.9 223961814 1708.7 199248175 1520.2
# Parse timestamps into POSIXct (UTC). The three school-site monitors record
# 'ymd HMS' strings; all other monitors use 'mdy HM'.
# NOTE(review): case_when() evaluates BOTH parse branches over every row, so
# each branch fails to parse the rows belonging to the other format -- the
# large "failed to parse" warning below is expected, not a data problem.
full_data <- full_data %>%
mutate(DateTime = case_when(Monitor %in% c('North HS', 'West HS', 'Elm Ave') ~ parse_date_time(DateTime, 'ymd HMS'),
.default = parse_date_time(DateTime, 'mdy HM'))) %>%
mutate(DateTime = as.POSIXct(DateTime, "%Y-%m-%d %H:%M:%OS", tz = 'UTC'))
## Warning: There were 2 warnings in `mutate()`.
## The first warning was:
## ℹ In argument: `DateTime = case_when(...)`.
## Caused by warning:
## ! 2031728 failed to parse.
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 1 remaining warning.
# Truncate each timestamp to midnight to get a calendar-day key (POSIXct, UTC).
# format() renders using DateTime's tzone attribute (UTC, set above) before
# re-parsing, so the round-trip stays in UTC.
full_data <- full_data %>%
mutate(day = as.POSIXct(format(DateTime, "%Y-%m-%d"), format="%Y-%m-%d", tz = 'UTC'))
# Remove observations before 2020, then drop exact duplicate rows.
# NOTE(review): the string '2019-12-31' is coerced to POSIXct in this
# comparison using the session timezone; an explicit
# as.POSIXct('2020-01-01', tz = 'UTC') comparison would be unambiguous.
full_data <- full_data %>% filter(day > '2019-12-31') %>% distinct()
# Derive calendar variables from the day key.
# weekday: unordered factor releveled so Sunday is the reference level
# (useful as a regression baseline later).
# Fix: refer to the piped column `day`, not `full_data$day`. Reaching back
# into the global data frame inside mutate() bypasses the pipeline and
# silently misaligns if any earlier pipe step filters or reorders rows.
full_data <- full_data %>%
  mutate(yearmonth = format(day, "%Y-%m"),
         year = format(day, "%Y"),
         month = format(day, "%m"),
         weekday = relevel(factor(wday(day, label = TRUE), ordered = FALSE), ref = "Sun"))
# KLAX (LAX airport) weather observations; the first CSV row is a units
# header, so drop it before converting columns to numeric.
klax_data <- read_csv('data/KLAX_2020-2023.csv', show_col_types = FALSE) %>%
  filter(row_number() != 1)
# Daily summaries: temperature/humidity/wind speed are arithmetic means;
# the 24h precipitation accumulator takes the first non-missing reading of
# the day; wind direction is averaged on the circle (package 'circular')
# so the 0/360-degree wrap-around is handled correctly.
# Style fix: spell out na.rm = TRUE (T is a reassignable binding).
klax_daily <- klax_data %>%
  group_by(Date) %>%
  summarise(avg_temp = mean(as.numeric(air_temp), na.rm = TRUE),
            avg_hum = mean(as.numeric(relative_humidity), na.rm = TRUE),
            precip = precip_accum_24_hour[which(!is.na(precip_accum_24_hour))[1]],
            ws = mean(as.numeric(wind_speed), na.rm = TRUE),
            wd = as.numeric(mean(circular(as.numeric(wind_direction), units = 'degrees'), na.rm = TRUE))) %>%
  mutate(precip = coalesce(as.numeric(precip), 0),        # no reading => 0 precipitation
         Date = parse_date_time(Date, c('m/d/y')),
         Date = as.POSIXct(Date, "%Y-%m-%d", tz = 'UTC'))
full_data <- rbind(full_data %>%
filter(!(Monitor %in% c('El Segundo', 'St. Anthony'))),
full_data %>%
select(-c(wd, ws)) %>%
filter(Monitor %in% c('El Segundo', 'St. Anthony')) %>%
left_join(klax_daily %>% select(Date, ws, wd), join_by(day == Date)))
# Remove rows without wd or ws.
# Fix: wrap the column names in all_of() -- passing a bare external
# character vector to a tidyselect argument is deprecated and ambiguous.
full_data <- full_data %>%
  drop_na(all_of(c("wd", "ws")))
# Check for duplicate time
# full_data %>%
# group_by(DateTime, Monitor) %>%
# summarise(n = n()) %>%
# filter(n > 1)
We observe multiple rows for the same date-time and monitor. From a quick glance, some of these could stem from a conversion issue going from Date.Time to DateTime, since the H2S measurements differ. In other cases, the duplicate rows are complements: one row contains data for the values that are missing in the other. It also appears that we sometimes get different measurements for the same time even on days without a daylight-saving transition.
# full_data %>%
# group_by(DateTime, Monitor) %>%
# summarise(n = n()) %>%
# filter(n > 1) %>%
# mutate(day = as.POSIXct(format(DateTime, "%Y-%m-%d"), format="%Y-%m-%d", tz = 'UTC')) %>%
# group_by(day, Monitor) %>%
# summarise(n = n())
# Deal with duplicate rows for same time
# For the 213 monitor it seems like the duplicate rows are complements to the other
#https://stackoverflow.com/questions/45515218/combine-rows-in-data-frame-containing-na-to-make-complete-row
# coalesce_by_column <- function(df) {
# return(dplyr::coalesce(!!! as.list(df)))
# }
#
# data_213 <- full_data %>%
# filter(Monitor == '213th & Chico') %>%
# group_by(DateTime) %>%
# summarise_all(coalesce_by_column)
#
# full_data <- full_data %>%
# filter(Monitor != '213th & Chico') %>%
# rbind(data_213)
# Inspect the combined data set's structure (column types and sample values).
glimpse(full_data)
## Rows: 3,315,509
## Columns: 32
## $ DateTime <dttm> 2021-10-22 15:15:00, 2021-10-22 15:20:00, 2021-10-…
## $ Date.Time <chr> "10/22/2021 15:15", "10/22/2021 15:20", "10/22/2021…
## $ DaylightSavings <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ H2S <dbl> 138.79, 183.26, 149.24, 263.18, 308.97, 169.06, 234…
## $ Unit <chr> "ppb", "ppb", "ppb", "ppb", "ppb", "ppb", "ppb", "p…
## $ QC.Code <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ MDL <dbl> 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0…
## $ Averaging.Hour <chr> "5 Min", "5 Min", "5 Min", "5 Min", "5 Min", "5 Min…
## $ Monitor <chr> "213th & Chico", "213th & Chico", "213th & Chico", …
## $ latitude <dbl> 33.83678, 33.83678, 33.83678, 33.83678, 33.83678, 3…
## $ longitude <dbl> -118.2586, -118.2586, -118.2586, -118.2586, -118.25…
## $ ws <dbl> 7.01, 6.32, 6.73, 6.54, 8.02, 7.82, 6.99, 6.71, 6.3…
## $ wd <dbl> 276.87, 250.15, 263.97, 280.36, 269.26, 256.21, 264…
## $ wd_QCcode <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ ws_QCcode <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ Join_Count <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ MinDist <dbl> 2873.053, 2873.053, 2873.053, 2873.053, 2873.053, 2…
## $ Converted_Angle <int> 306, 306, 306, 306, 306, 306, 306, 306, 306, 306, 3…
## $ Refinery <chr> "Marathon (Carson)", "Marathon (Carson)", "Marathon…
## $ siteId <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ unitName <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ qcName <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ opName <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ windSpeed_Unit <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ windDirection_Unit <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ opCode <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ Source <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ day <dttm> 2021-10-22, 2021-10-22, 2021-10-22, 2021-10-22, 20…
## $ yearmonth <chr> "2021-10", "2021-10", "2021-10", "2021-10", "2021-1…
## $ year <chr> "2021", "2021", "2021", "2021", "2021", "2021", "20…
## $ month <chr> "10", "10", "10", "10", "10", "10", "10", "10", "10…
## $ weekday <fct> Fri, Fri, Fri, Fri, Fri, Fri, Fri, Fri, Fri, Fri, F…
# Numeric summaries of every column; note the sizeable NA counts in the
# output below (H2S, ws/wd, QC fields).
summary(full_data)
## DateTime Date.Time DaylightSavings
## Min. :2020-01-01 00:00:00.00 Length:3315509 Min. :0
## 1st Qu.:2021-04-29 07:15:00.00 Class :character 1st Qu.:0
## Median :2021-12-13 21:30:00.00 Mode :character Median :0
## Mean :2021-12-08 01:00:39.75 Mean :0
## 3rd Qu.:2022-08-21 17:15:00.00 3rd Qu.:0
## Max. :2023-05-21 00:55:00.00 Max. :1
## NA's :1064847
## H2S Unit QC.Code MDL
## Min. : 0.00 Length:3315509 Min. : 0 Min. :0.4
## 1st Qu.: 0.22 Class :character 1st Qu.: 0 1st Qu.:0.4
## Median : 0.60 Mode :character Median : 0 Median :0.4
## Mean : 1.13 Mean : 0 Mean :0.4
## 3rd Qu.: 1.11 3rd Qu.: 0 3rd Qu.:0.4
## Max. :3817.24 Max. :121 Max. :0.4
## NA's :102956 NA's :461526 NA's :1167803
## Averaging.Hour Monitor latitude longitude
## Length:3315509 Length:3315509 Min. :33.78 Min. :-118.4
## Class :character Class :character 1st Qu.:33.79 1st Qu.:-118.3
## Mode :character Mode :character Median :33.82 Median :-118.3
## Mean :33.84 Mean :-118.3
## 3rd Qu.:33.86 3rd Qu.:-118.3
## Max. :33.92 Max. :-118.2
## NA's :301 NA's :301
## ws wd wd_QCcode ws_QCcode
## Min. : 0.000 Min. :-179.5 Min. :0 Min. :0
## 1st Qu.: 1.780 1st Qu.: 101.7 1st Qu.:0 1st Qu.:0
## Median : 3.610 Median : 213.2 Median :0 Median :0
## Mean : 4.164 Mean : 182.1 Mean :0 Mean :0
## 3rd Qu.: 6.147 3rd Qu.: 282.0 3rd Qu.:0 3rd Qu.:0
## Max. :57.780 Max. : 360.0 Max. :7 Max. :8
## NA's :1064847 NA's :1064847
## Join_Count MinDist Converted_Angle Refinery
## Min. :1.000 Min. : 758.7 Min. : 11.0 Length:3315509
## 1st Qu.:1.000 1st Qu.:1308.3 1st Qu.:189.0 Class :character
## Median :1.000 Median :1988.2 Median :209.0 Mode :character
## Mean :1.912 Mean :1948.0 Mean :205.5
## 3rd Qu.:3.000 3rd Qu.:2541.9 3rd Qu.:263.0
## Max. :5.000 Max. :3592.8 Max. :338.0
##
## siteId unitName qcName opName
## Length:3315509 Length:3315509 Length:3315509 Length:3315509
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## windSpeed_Unit windDirection_Unit opCode Source
## Length:3315509 Length:3315509 Min. :0 Length:3315509
## Class :character Class :character 1st Qu.:0 Class :character
## Mode :character Mode :character Median :0 Mode :character
## Mean :0
## 3rd Qu.:0
## Max. :0
## NA's :2609232
## day yearmonth year
## Min. :2020-01-01 00:00:00.00 Length:3315509 Length:3315509
## 1st Qu.:2021-04-29 00:00:00.00 Class :character Class :character
## Median :2021-12-13 00:00:00.00 Mode :character Mode :character
## Mean :2021-12-07 13:03:15.26
## 3rd Qu.:2022-08-21 00:00:00.00
## Max. :2023-05-21 00:00:00.00
##
## month weekday
## Length:3315509 Sun:471574
## Class :character Mon:466861
## Mode :character Tue:472058
## Wed:475719
## Thu:476384
## Fri:476201
## Sat:476712
Some concerns for the data include: for TorranceAir_H2SWind, there are actually three sites, but only one site contains ws/wd information. What is the difference between DateTime and Date/Time? DateTime is the time after conversion. What is the difference between Date/Time and Date/Time.First? The latter is present in FirstMethodist. Missing H2S values could arise from a mismatch when merging, or from being flagged due to issues such as low detection (minimum detection limit: 0.4).
# For Judson, it appeared that longitude had significant-figure issues.
# Patch monitor coordinates using corrected lookup tables; any monitor not
# listed keeps its original latitude/longitude (Judson gets only a
# longitude correction).
lon_fix <- c('Judson'       = -118.268128,
             'West HS'      = -118.36836539131416,
             'Elm Ave'      = -118.3316298600153,
             'North HS'     = -118.33660803949037,
             'G Street'     = -118.2818168,
             'Guenser Park' = -118.313499,
             'Harbor Park'  = -118.286028)
lat_fix <- c('West HS'      = 33.85032033929173,
             'Elm Ave'      = 33.84048285612423,
             'North HS'     = 33.86785391284254,
             'G Street'     = 33.77785794,
             'Guenser Park' = 33.869231,
             'Harbor Park'  = 33.786252)
full_data <- full_data %>%
  mutate(longitude = coalesce(unname(lon_fix[Monitor]), longitude),
         latitude = coalesce(unname(lat_fix[Monitor]), latitude))
remove(lon_fix, lat_fix)
# For observations below the MDL, set the value to half of the MDL (0.2).
# As confirmed, the MDL for all monitors is 0.4.
# First report how many observations fall below the MDL.
# Fixes: drop the no-op empty group_by(); use the piped column H2S rather
# than full_data$H2S (which bypasses the pipeline); TRUE instead of T.
full_data %>%
  summarise(`Below MDL` = sum(H2S < 0.4, na.rm = TRUE),
            Proportion = sum(H2S < 0.4, na.rm = TRUE) / n())
full_data <- full_data %>%
  mutate(H2S = if_else(H2S < 0.4, 0.2, H2S))
# Compute the daily max H2S per monitor-day.
# Guard all-NA groups explicitly: max(x, na.rm = TRUE) on an all-NA vector
# warns and returns -Inf, which the original then patched back to NA with
# if_else(). Checking up front produces the same NA result without the
# 171 warnings. .groups = 'drop' returns an ungrouped result (the next
# step regroups as needed anyway).
H2S_dm <- full_data %>%
  group_by(day, Monitor) %>%
  summarise(H2S_daily_max = if (all(is.na(H2S))) NA_real_ else max(H2S, na.rm = TRUE),
            .groups = 'drop')
## Warning: There were 171 warnings in `summarise()`.
## The first warning was:
## ℹ In argument: `H2S_daily_max = max(H2S, na.rm = TRUE)`.
## ℹ In group 1660: `day = 2021-01-19`, `Monitor = "G Street"`.
## Caused by warning in `max()`:
## ! no non-missing arguments to max; returning -Inf
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 170 remaining warnings.
## `summarise()` has grouped output by 'day'. You can override using the `.groups`
## argument.
# Attach the daily max back onto every row of that monitor-day.
full_data <- full_data %>%
left_join(H2S_dm, by = c('day', 'Monitor'))
# Monthly average of the daily maxima, per monitor.
H2S_ma <- H2S_dm %>%
mutate(yearmonth = format(day, "%Y-%m")) %>%
group_by(yearmonth, Monitor) %>%
summarise(H2S_monthly_average = mean(H2S_daily_max, na.rm = TRUE))
## `summarise()` has grouped output by 'yearmonth'. You can override using the
## `.groups` argument.
full_data <- full_data %>%
left_join(H2S_ma, by = c('yearmonth', 'Monitor'))
remove(H2S_ma, H2S_dm)
# Compute daily average
# (includes the 0.2 values substituted for below-MDL observations above)
H2S_da <- full_data %>%
group_by(day, Monitor) %>%
summarise(H2S_daily_avg = mean(H2S, na.rm=TRUE))
## `summarise()` has grouped output by 'day'. You can override using the `.groups`
## argument.
full_data <- full_data %>%
left_join(H2S_da, by = c('day', 'Monitor'))
remove(H2S_da)
# Bin wind direction (degrees) into quadrant sectors:
# [0, 90) -> Q1, [90, 180) -> Q2, [180, 270) -> Q3, [270, 360] -> Q4.
# Values outside [0, 360] (or NA) yield NA.
full_data <- full_data %>%
  mutate(wd_sec = case_when(wd >= 0 & wd < 90 ~ 'Q1',
                            wd >= 90 & wd < 180 ~ 'Q2',
                            wd >= 180 & wd < 270 ~ 'Q3',
                            wd >= 270 & wd <= 360 ~ 'Q4'))
# Water reclamation plant locations.
la_wrp <- read_sf('shapefiles/LA_WRP.shp', layer = 'LA_WRP')
# Fix: the original proj4 string read "...+zone=11 ellps=WGS84" -- the
# ellipsoid token was missing its leading '+', so PROJ did not parse it as
# the WGS84 ellipsoid. (EPSG:32611 would be the equivalent named CRS.)
CRS_UTM <- CRS("+proj=utm +zone=11 +ellps=WGS84")
# Convert both water treatment plants and monitor coordinates to UTM
# so distances come out in metres.
la_wrp <- st_transform(la_wrp, CRS_UTM)
monitors <- full_data %>%
  select(Monitor, latitude, longitude) %>%
  distinct() %>%
  st_as_sf(coords = c('longitude', 'latitude')) %>%
  st_set_crs(4326) %>%
  st_transform(CRS_UTM)
# Get the nearest WRP for each monitor. Compute the nearest-feature index
# once and reuse it (the original recomputed it three times).
nearest_wrp <- st_nearest_feature(monitors, la_wrp)
monitor_wrp <- monitors %>%
  mutate(wrp_name = la_wrp$CWP_NAM[nearest_wrp],
         dist_wrp = apply(st_distance(monitors, la_wrp), 1, min),
         capacity = la_wrp$cpcty_m[nearest_wrp])
# Geodesic azimuth from each monitor to its nearest WRP. pivot_longer
# interleaves the points (monitor1, wrp1, monitor2, wrp2, ...), and
# st_geod_azimuth() returns azimuths between consecutive points, so the
# odd-indexed values are the valid monitor->WRP pairs.
angles <- tibble(monitors$geometry, la_wrp$geometry[nearest_wrp]) %>%
  pivot_longer(cols = everything()) %>%
  pull(value) %>% # extract coordinates only
  st_transform(4326) %>% # convert to lat/lon for st_geod_azimuth()
  st_geod_azimuth() %>%
  set_units('degrees') %>% # convert to degrees
  drop_units()
angles <- angles[c(TRUE, FALSE)] # keep only odd indices: valid pairs (TRUE/FALSE, not T/F)
angles <- if_else(angles < 0, angles + 360, angles) # wrap to [0, 360)
monitor_wrp$wrp_angle <- angles
full_data <- full_data %>%
  left_join(tibble(monitor_wrp) %>% select(-geometry), join_by('Monitor'))
# Read in the shape file, it already has a CRS
st_read('shapefiles/DominguezChannel_Carson.shp')
## Reading layer `DominguezChannel_Carson' from data source
## `D:\313\Year4Uni\Reading Course\H2S-study\shapefiles\DominguezChannel_Carson.shp'
## using driver `ESRI Shapefile'
## Simple feature collection with 9 features and 17 fields
## Geometry type: LINESTRING
## Dimension: XYZ
## Bounding box: xmin: -118.3382 ymin: 33.77701 xmax: -118.2275 ymax: 33.92881
## z_range: zmin: 0 zmax: 0
## Geodetic CRS: WGS 84
d_channel <- read_sf('shapefiles/DominguezChannel_Carson.shp', layer = 'DominguezChannel_Carson')
d_channel <- st_transform(d_channel, CRS_UTM) # convert to UTM crs
# Per-monitor distances (metres): to the nearest channel segment, and to
# the 213th & Chico monitor.
monitor_d_channel <- monitors %>%
mutate(dist_dc = apply(st_distance(monitors, d_channel), 1, min),
dist_213 = c(drop_units(st_distance(monitors,
monitors[monitors$Monitor == '213th & Chico',]))))
full_data <- full_data %>%
left_join(tibble(monitor_d_channel) %>% select(-geometry), join_by(Monitor))
# Sanity-check map: channel plus monitor points, each labelled with its
# distance to the channel.
dist_plot <- d_channel %>%
ggplot() +
geom_sf() +
geom_sf(data = monitors) +
geom_sf_label(data = monitors %>% mutate(dist_dc = apply(st_distance(monitors, d_channel), 1, min)),
aes(label = paste0(Monitor, ' ', round(dist_dc, 2)))) +
theme_minimal()
dist_plot
# Free the spatial intermediates; only `monitors` is still needed below.
remove(dist_plot, la_wrp, monitor_wrp, monitor_d_channel, d_channel)
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 4006932 214.0 6079797 324.7 6079797 324.7
## Vcells 324227355 2473.7 560834762 4278.9 560830006 4278.8
# Torrance (KTOA) and Long Beach (KLGB) airport weather. (KLAX was already
# read above.) Drop the first row of each file, consistent with the other
# station files.
ktoa_data <- read_csv('data/KTOA_2020-2023.csv', show_col_types = FALSE) %>%
  slice(-1)
klgb_data <- read_csv('data/KLGB_2020-2023.csv', show_col_types = FALSE) %>%
  slice(-1)
# compute daily average temperature, humidity, and precipitation
# klax_daily <- klax_data %>%
# group_by(Date) %>%
# summarise(avg_temp = mean(as.numeric(air_temp), na.rm=T),
# avg_hum = mean(as.numeric(relative_humidity), na.rm=T),
# precip = precip_accum_24_hour[which(!is.na(precip_accum_24_hour))[1]]) %>%
# mutate(precip = coalesce(as.numeric(precip), 0),
# Date = parse_date_time(Date, c('m/d/y')),
# Date = as.POSIXct(Date, "%Y-%m-%d", tz = 'UTC'))
# KLAX's ws/wd were already merged into full_data above; drop them here so
# klax_daily has the same columns as the other stations' daily tables.
klax_daily <- klax_daily %>%
select(-c(ws, wd))
# Daily summaries for KLGB (temperature, humidity, precipitation) and KTOA
# (temperature and humidity only; no precip column is used for KTOA).
# Style fix: spell out na.rm = TRUE (T is a reassignable binding).
klgb_daily <- klgb_data %>%
  group_by(Date) %>%
  summarise(avg_temp = mean(as.numeric(air_temp), na.rm = TRUE),
            avg_hum = mean(as.numeric(relative_humidity), na.rm = TRUE),
            # first non-missing 24h accumulator reading of the day
            precip = precip_accum_24_hour[which(!is.na(precip_accum_24_hour))[1]]) %>%
  mutate(precip = coalesce(as.numeric(precip), 0),
         Date = parse_date_time(Date, c('m/d/y')),
         Date = as.POSIXct(Date, "%Y-%m-%d", tz = 'UTC'))
ktoa_daily <- ktoa_data %>%
  group_by(Date) %>%
  summarise(avg_temp = mean(as.numeric(air_temp), na.rm = TRUE),
            avg_hum = mean(as.numeric(relative_humidity), na.rm = TRUE)) %>%
  mutate(Date = parse_date_time(Date, c('m/d/y')),
         Date = as.POSIXct(Date, "%Y-%m-%d", tz = 'UTC'))
remove(klax_data, ktoa_data, klgb_data)
# notice that the Torrance station is missing 10 days of observations
# for that 10 days, use the long beach airport data as substitutes
# (anti_join keeps the KLGB dates absent from KTOA; precip is dropped
# because the KTOA table has no precip column)
ktoa_daily <- bind_rows(ktoa_daily,
anti_join(klgb_daily,ktoa_daily, by = 'Date') %>% select(-precip))
# Map each monitor to its closest weather station (temperature/humidity
# from any of the three; precipitation only from KLAX/KLGB since KTOA has
# no precip data).
weather_stations <- tibble(station = c('KLAX', 'KLGB', 'KTOA'),
latitude = c(33.93806, 33.81167, 33.803979),
longitude = c(-118.33194, -118.38889, -118.339432)) %>%
st_as_sf(coords = c('longitude', 'latitude')) %>%
st_set_crs(4326)
weather_stations <- st_transform(weather_stations, CRS_UTM)
closest_temp_hum_station <- weather_stations$station[st_nearest_feature(monitors, weather_stations)]
# NOTE(review): the indices below come from the KTOA-filtered set but index
# the full station vector; this is only correct because KTOA is listed last
# so the filtered rows keep positions 1-2. Safer would be to index the
# filtered vector itself -- confirm before reordering stations.
closest_precip_station <- weather_stations$station[st_nearest_feature(monitors, weather_stations %>% filter(station != 'KTOA'))]
monitors_weather <- tibble(Monitor = monitors$Monitor,
weather_station = closest_temp_hum_station,
closest_precip_station = closest_precip_station)
# Long tables keyed by (Date, station) so a single join per variable set
# attaches the right station's values to each monitor-day.
weather_full <- bind_rows(klax_daily %>% select(-precip) %>%
mutate(weather_station = 'KLAX'),
klgb_daily %>% select(-precip) %>%
mutate(weather_station = 'KLGB'),
ktoa_daily %>%
mutate(weather_station = 'KTOA'))
precip_full <- bind_rows(klax_daily %>% select(Date, precip) %>%
mutate(closest_precip_station = 'KLAX'),
klgb_daily %>% select(Date, precip) %>%
mutate(closest_precip_station = 'KLGB'))
full_data <- full_data %>%
left_join(monitors_weather, join_by('Monitor')) %>%
left_join(weather_full, join_by('day' == 'Date', 'weather_station')) %>%
left_join(precip_full, join_by('day' == 'Date', 'closest_precip_station'))
remove(klax_daily, ktoa_daily, klgb_daily, precip_full, weather_full,
monitors_weather, weather_stations)
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 3965467 211.8 6079797 324.7 6079797 324.7
## Vcells 340334119 2596.6 812609506 6199.8 676897829 5164.4
# Oil/gas well data: monthly production plus header (location/status)
# tables for active wells, and the full well list used to derive the
# inactive set.
well_prod <- read_csv('data/LA well prod Dec2023 Well Monthly Production.CSV', show_col_types = FALSE)
full_well_info <- read_csv('data/well_active_inactive_prod_since_2000.CSV', show_col_types = FALSE)
active_well_info <- read_csv('data/LA well prod Dec2023 Well Headers.CSV', show_col_types = FALSE)
# Inactive wells = wells in the full list with no active header record;
# keep one surface-hole location per API and project to UTM.
inactive_well_info <- anti_join(full_well_info, active_well_info, join_by(API14)) %>%
rename(lon = `Surface Hole Longitude (WGS84)`,
lat = `Surface Hole Latitude (WGS84)`) %>%
select(`API14`, lon, lat) %>%
distinct() %>%
st_as_sf(coords = c('lon', 'lat')) %>%
st_set_crs(4326) %>%
st_transform(CRS_UTM)
# Attach header info to each monthly production row. The `.x` suffixes
# below come from overlapping column names in the two tables.
well_prod <- well_prod %>%
left_join(active_well_info, join_by(`API/UWI` == API14))
well_prod <- well_prod %>%
select(`API/UWI`, `Monthly Production Date`, `Monthly Oil`, `Monthly Gas`,
`Monthly Water`, `Monthly BOE`, Days, `Operator Company Name.x`,
`Production Type.x`, `Well Status.x`, `Drill Type`,
`Surface Hole Latitude (WGS84)`, `Surface Hole Longitude (WGS84)`) %>%
rename(lon = `Surface Hole Longitude (WGS84)`,
lat = `Surface Hole Latitude (WGS84)`)
# project distinct well locations to UTM
active_well_location <- well_prod %>%
select(`API/UWI`, lon, lat) %>%
distinct() %>%
st_as_sf(coords = c('lon', 'lat')) %>%
st_set_crs(4326) %>%
st_transform(CRS_UTM)
# For each monitor, collect the API numbers of active wells within 2 km.
# Fixes: seq_len() instead of 1:nrow() (safe for zero rows), and build the
# per-monitor tibbles in a preallocated list, binding once at the end,
# instead of growing a data frame with rbind() inside the loop.
monitor_active_wells <- vector("list", nrow(monitors))
for (i in seq_len(nrow(monitors))) {
  well_dists <- drop_units(st_distance(monitors[i, ], active_well_location))
  nearby_wells <- active_well_location[which(well_dists <= 2000), ] %>% st_drop_geometry()
  monitor_active_wells[[i]] <- tibble(Monitor = monitors[i, 1]$Monitor,
                                      API14 = nearby_wells$`API/UWI`)
}
monitor_active_wells <- bind_rows(monitor_active_wells)
# Monthly oil/gas production summed over each monitor's nearby active
# wells. A well can sit within 2 km of several monitors and each API has
# many monthly rows, so the many-to-many join is expected -- declare it
# explicitly to silence dplyr's warning. .groups = 'drop' replaces the
# "grouped output" message with an ungrouped result.
monitor_well_prod <- monitor_active_wells %>%
  left_join(well_prod, join_by(API14 == `API/UWI`),
            relationship = 'many-to-many') %>%
  group_by(Monitor, `Monthly Production Date`) %>%
  summarise(active_2km = n(),
            monthly_oil_2km = sum(`Monthly Oil`, na.rm = TRUE),
            monthly_gas_2km = sum(`Monthly Gas`, na.rm = TRUE),
            .groups = 'drop') %>%
  mutate(yearmonth = format(`Monthly Production Date`, '%Y-%m')) %>%
  select(-`Monthly Production Date`)
## Warning in left_join(., well_prod, join_by(API14 == `API/UWI`)): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 1 of `x` matches multiple rows in `y`.
## ℹ Row 18032 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
## "many-to-many"` to silence this warning.
## `summarise()` has grouped output by 'Monitor'. You can override using the
## `.groups` argument.
# Attach monthly well production; monitor-months with no nearby active
# wells become 0 rather than NA.
full_data <- full_data %>%
  left_join(monitor_well_prod, join_by(Monitor, yearmonth)) %>%
  mutate(across(c(active_2km, monthly_oil_2km, monthly_gas_2km),
                ~ coalesce(.x, 0)))
# Count inactive wells within 2 km of each monitor.
# Same loop fixes as for the active wells: seq_len() and a preallocated
# list bound once, instead of rbind() growth.
monitor_inactive_wells <- vector("list", nrow(monitors))
for (i in seq_len(nrow(monitors))) {
  well_dists <- drop_units(st_distance(monitors[i, ], inactive_well_info))
  nearby_wells <- inactive_well_info[which(well_dists <= 2000), ] %>% st_drop_geometry()
  monitor_inactive_wells[[i]] <- tibble(Monitor = monitors[i, 1]$Monitor,
                                        inactive_2km = nrow(nearby_wells))
}
monitor_inactive_wells <- bind_rows(monitor_inactive_wells)
# Attach the per-monitor inactive-well count (constant per monitor).
full_data <- full_data %>%
left_join(monitor_inactive_wells, join_by(Monitor))
# get monthly production data at different distance levels (monitor to well)
# prod_data <- list(
# well_prod %>%
# filter(`1km` == TRUE) %>%
# group_by(`Monthly Production Date`) %>%
# summarise(active_1km = n(),
# monthly_oil_1km = sum(`Monthly Oil`, na.rm = TRUE),
# monthly_gas_1km = sum(`Monthly Gas`, na.rm = TRUE),
# monthly_water_1km = sum(`Monthly Water`, na.rm = TRUE),
# monthly_boe_1km = sum(`Monthly BOE`, na.rm = TRUE))
# ,
# well_prod %>%
# filter(`2p5km` == TRUE) %>%
# group_by(`Monthly Production Date`) %>%
# summarise(active_2p5km = n(),
# monthly_oil_2p5km = sum(`Monthly Oil`, na.rm = TRUE),
# monthly_gas_2p5km = sum(`Monthly Gas`, na.rm = TRUE),
# monthly_water_2p5km = sum(`Monthly Water`, na.rm = TRUE),
# monthly_boe_2p5km = sum(`Monthly BOE`, na.rm = TRUE))
# ,
# well_prod %>%
# filter(`5km` == TRUE) %>%
# group_by(`Monthly Production Date`) %>%
# summarise(active_5km = n(),
# monthly_oil_5km = sum(`Monthly Oil`, na.rm = TRUE),
# monthly_gas_5km = sum(`Monthly Gas`, na.rm = TRUE),
# monthly_water_5km = sum(`Monthly Water`, na.rm = TRUE),
# monthly_boe_5km = sum(`Monthly BOE`, na.rm = TRUE))
# ) %>%
# reduce(inner_join, by = 'Monthly Production Date') %>%
# mutate(yearmonth = format(`Monthly Production Date`, '%Y-%m'))
#
# full_data <- full_data %>%
# left_join(prod_data, join_by(yearmonth))
# cleanup columns
# Drop metadata/QC columns no longer needed for analysis.
full_data <- full_data %>%
select(-c('Date.Time', 'MDL', 'Unit', 'Averaging.Hour', 'Join_Count', 'siteId', 'unitName',
'qcName', 'opName', 'windSpeed_Unit', 'windDirection_Unit', 'opCode', 'Source',
'DaylightSavings', 'QC.Code', 'wd_QCcode', 'ws_QCcode'))
# Downwind indicators: a monitor counts as downwind of a source when the
# wind direction is within 15 degrees of the source-to-monitor bearing.
# Helper (deduplicates the logic used twice below and again for the daily
# data): smallest absolute angular difference between two bearings.
angle_diff <- function(a, b) {
  d <- abs(a - b)
  ifelse(d > 180, 360 - d, d)
}
# Refinery: Converted_Angle - 180 is used as the downwind bearing --
# presumably Converted_Angle is the monitor->refinery bearing; TODO confirm.
full_data$downwind_ref <- as.integer(angle_diff(full_data$Converted_Angle - 180, full_data$wd) <= 15)
# WRP: wrp_angle is the monitor->WRP azimuth computed earlier.
full_data$downwind_wrp <- as.integer(angle_diff(full_data$wrp_angle, full_data$wd) <= 15)
# Fix: '+ellps=WGS84' -- the original string was missing the leading '+',
# so the ellipsoid token was not parsed.
CRS_UTM <- CRS("+proj=utm +zone=11 +ellps=WGS84")
# Monitor UTM coordinates as plain numeric columns, joined onto full_data.
monitors <- full_data %>%
  select(Monitor, latitude, longitude) %>%
  distinct() %>%
  st_as_sf(coords = c('longitude', 'latitude')) %>%
  st_set_crs(4326) %>%
  st_transform(CRS_UTM)
monitors <- tibble(Monitor = monitors$Monitor,
                   mon_utm_x = st_coordinates(monitors)[,1],
                   mon_utm_y = st_coordinates(monitors)[,2])
full_data <- full_data %>%
  left_join(monitors, join_by(Monitor))
# Compute daily average wd/ws
# Build the daily data set: keep only day-level columns, add per-day wind
# averages (circular mean for direction, wrapped back to [0, 360)), then
# distinct() collapses the per-observation rows to one row per monitor-day.
daily_full <- full_data %>%
select(H2S_daily_max, H2S_daily_avg, H2S_monthly_average, ws, wd, latitude,
longitude, mon_utm_x, mon_utm_y, Monitor, MinDist,
Converted_Angle, Refinery, year, month, day, weekday,
active_2km, monthly_oil_2km, monthly_gas_2km, inactive_2km,
dist_wrp, capacity, wrp_angle, dist_dc, dist_213,
avg_temp, avg_hum, precip) %>%
group_by(Monitor, day) %>%
mutate(ws_avg = mean(ws, na.rm=TRUE),
wd_avg = as.numeric(mean(circular(wd, units = 'degrees'), na.rm=TRUE))) %>%
mutate(wd_avg = if_else(wd_avg < 0, wd_avg+360, wd_avg)) %>%
ungroup() %>%
rename(monitor_lat = latitude, monitor_lon = longitude) %>%
select(-c(wd, ws)) %>%
distinct()
# Get the downwind indicators for daily data
# Same 15-degree criterion as the per-observation indicators above, but
# applied to the daily circular-mean wind direction.
wind_diff <- abs(daily_full$Converted_Angle - 180 - daily_full$wd_avg)
wind_diff <- ifelse(wind_diff > 180, 360 - wind_diff, wind_diff)
daily_full$daily_downwind_ref <- as.integer(wind_diff <= 15)
wind_diff <- abs(daily_full$wrp_angle - daily_full$wd_avg)
wind_diff <- ifelse(wind_diff > 180, 360 - wind_diff, wind_diff)
daily_full$daily_downwind_wrp <- as.integer(wind_diff <= 15)
# Extract elevation (SRTM tile) and vegetation index (MODIS EVI, scale
# factor 0.0001) at each monitor location via raster::extract on an sp
# points object.
monitors_coord <- full_data %>%
select(Monitor, latitude, longitude) %>%
distinct()
monitors <- monitors_coord %>%
select(Monitor)
# Converts monitors_coord to a SpatialPointsDataFrame in place.
coordinates(monitors_coord) <- ~ longitude + latitude
elevation <- raster('shapefiles/N33W119.hgt')
# NOTE(review): rename(elevation = 2) / rename(EVI = 3) are positional and
# fragile -- they rely on the cbind column order.
monitors <- cbind(monitors, extract(elevation, monitors_coord)) %>%
as.data.frame() %>%
rename(elevation = 2)
#evi <- raster('shapefiles/MOD13Q1.061__250m_16_days_EVI_doy2023145_aid0001.tif')
evi <- raster('shapefiles/MOD13Q1.006__250m_16_days_EVI_doy2022177_aid0001.tif')
monitors <- cbind(monitors, extract(evi, monitors_coord) * 0.0001) %>%
as.data.frame() %>%
rename(EVI = 3)
full_data <- full_data %>%
left_join(monitors, join_by(Monitor))
daily_full <- daily_full %>%
left_join(monitors, join_by(Monitor))
# Odor complaint counts by zip code and date.
odor <- read_csv('data/odorcomplaintdata_2018_2023.csv')
## New names:
## Rows: 22926 Columns: 4
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," dbl
## (3): ...1, zip, num_odor_complaints date (1): date
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
la_county <- read_sf('shapefiles/Zip_Codes_(LA_County)/Zip_Codes_(LA_County).shp')
# https://gis.stackexchange.com/questions/282750/identify-polygon-containing-point-with-r-sf-package
# Rebuild the monitor points as an sfc so they can be intersected with the
# zip-code polygons. NOTE(review): 1:nrow(...) would misbehave on an empty
# coords matrix; seq_len() would be safer.
pnts_sf <- do.call("st_sfc",c(lapply(1:nrow(monitors_coord@coords),
function(i) {st_point(as.numeric(monitors_coord@coords[i, ]))}), list("crs" = 4326)))
pnts_trans <- st_transform(pnts_sf, CRS_UTM) # apply transformation to pnts sf
la_county_trans <- st_transform(la_county, CRS_UTM) # apply transformation to polygons sf
# intersect and extract the containing zip code for each monitor
monitors_coord@data$county <- apply(st_intersects(la_county_trans, pnts_trans, sparse = FALSE), 2,
function(col) {
la_county_trans[which(col), ]$ZIPCODE
})
# NOTE(review): library() call mid-script; consider moving to the setup chunk.
library(ggrepel)
# Plot results to double check
# Map the zip polygons that contain a monitor, with labelled monitor points.
la_county_present <- la_county[la_county$ZIPCODE %in% unique(monitors_coord$county), ]
monitor_zip <- tibble(Monitor = monitors_coord@data$Monitor,
zip = monitors_coord@data$county,
longitude = monitors_coord@coords[,1],
latitude = monitors_coord@coords[,2])
monitor_zip_graph <- ggplot() +
geom_sf(data = la_county_present, aes(fill = ZIPCODE)) +
geom_point(data = monitor_zip,
aes(x = longitude, y = latitude))
monitor_zip_graph +
geom_label_repel(data = monitor_zip, aes(x = longitude, y = latitude, label = Monitor),
box.padding = 0.35,
point.padding = 0.5,
segment.color = 'grey50') +
theme_minimal()
# Keep only complaints from zip codes containing a monitor, then attach
# complaint counts to each monitor via its zip code. Several monitors can
# share a zip and each zip has many complaint dates, so the many-to-many
# join is expected -- declare it to silence dplyr's warning.
odor <- odor[odor$zip %in% unique(monitors_coord$county), ]
monitor_odor <- monitors_coord@data %>%
  left_join(odor %>% mutate(zip = as.character(zip)), join_by(county == zip),
            relationship = 'many-to-many') %>%
  select(-`...1`)
## Warning in left_join(., odor %>% mutate(zip = as.character(zip)), join_by(county == : Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 1 of `x` matches multiple rows in `y`.
## ℹ Row 10 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
## "many-to-many"` to silence this warning.
# Attach each monitor's zip code, then the daily odor-complaint counts;
# monitor-days with no recorded complaints become 0 instead of NA.
full_data <- full_data %>%
  left_join(monitors_coord@data, join_by(Monitor)) %>%
  left_join(monitor_odor %>% select(Monitor, date, num_odor_complaints),
            join_by(Monitor, day == date)) %>%
  mutate(num_odor_complaints = coalesce(num_odor_complaints, 0))
daily_full <- daily_full %>%
  left_join(monitors_coord@data, join_by(Monitor)) %>%
  left_join(monitor_odor %>% select(Monitor, date, num_odor_complaints),
            join_by(Monitor, day == date)) %>%
  mutate(num_odor_complaints = coalesce(num_odor_complaints, 0))
# Interactive time-series of complaint counts per zip; the second version
# caps the y-axis at 30 to hide outliers.
odor_graph <- odor %>%
filter(zip %in% unique(monitors_coord$county)) %>%
mutate(zip = as.character(zip)) %>%
ggplot(aes(x=date, y=num_odor_complaints, group=zip, color=zip)) +
geom_line() +
labs(title = "Number of odor complaints over time", y = 'Number of odor complaints', x = 'time') +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45))
ggplotly(odor_graph) %>% layout(dragmode = 'pan')
odor_graph_b <- odor %>%
filter(zip %in% unique(monitors_coord$county)) %>%
mutate(zip = as.character(zip)) %>%
ggplot(aes(x=date, y=num_odor_complaints, group=zip, color=zip)) +
geom_line() +
labs(title = "Number of odor complaints over time w/o outliers", y = 'Number of odor complaints', x = 'time') +
scale_y_continuous(limits = c(0, 30)) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45))
ggplotly(odor_graph_b) %>% layout(dragmode = 'pan')
# Persist the assembled data sets for the analysis notebooks.
saveRDS(full_data, 'data/full_data.rds')
saveRDS(daily_full, 'data/daily_full.rds')
# Clean up plotting/intermediate objects.
# Fix: the original listed 'monitor_coord' -- a typo for 'monitors_coord' --
# which triggered an "object not found" warning and left the intended
# object in memory.
remove(la_county, la_county_present, la_county_trans, monitor_odor, monitor_zip,
       monitor_zip_graph, monitors, monitors_coord, odor, odor_graph, odor_graph_b,
       pnts_sf, pnts_trans, elevation, evi)
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 4226784 225.8 7609004 406.4 6079797 324.7
## Vcells 331324847 2527.9 812609506 6199.8 812124887 6196.1